DDSAnalytics is an analytics company that specializes in talent management solutions for Fortune 100 companies. DDSAnalytics has taken on a contract with Frito Lay to tackle predicting employee turnover. Analyzing an existing employee dataset of 870 unique observations with 36 categories, several visualization and models were generated to determine the top three factors leading to attrition.
library(readr)
employeeData <- read.csv("CaseStudy2-data.csv")
# list rows of data that have missing values
employeeData[!complete.cases(employeeData),]
[1] ID Age Attrition
[4] BusinessTravel DailyRate Department
[7] DistanceFromHome Education EducationField
[10] EmployeeCount EmployeeNumber EnvironmentSatisfaction
[13] Gender HourlyRate JobInvolvement
[16] JobLevel JobRole JobSatisfaction
[19] MaritalStatus MonthlyIncome MonthlyRate
[22] NumCompaniesWorked Over18 OverTime
[25] PercentSalaryHike PerformanceRating RelationshipSatisfaction
[28] StandardHours StockOptionLevel TotalWorkingYears
[31] TrainingTimesLastYear WorkLifeBalance YearsAtCompany
[34] YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
<0 rows> (or 0-length row.names)
#Check the data type of the variables in the file
glimpse(employeeData)
Rows: 870
Columns: 36
$ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14~
$ Age <int> 32, 40, 35, 32, 24, 27, 41, 37, 34, 34, 43, 2~
$ Attrition <chr> "No", "No", "No", "No", "No", "No", "No", "No~
$ BusinessTravel <chr> "Travel_Rarely", "Travel_Rarely", "Travel_Fre~
$ DailyRate <int> 117, 1308, 200, 801, 567, 294, 1283, 309, 133~
$ Department <chr> "Sales", "Research & Development", "Research ~
$ DistanceFromHome <int> 13, 14, 18, 1, 2, 10, 5, 10, 10, 10, 6, 1, 7,~
$ Education <int> 4, 3, 2, 4, 1, 2, 5, 4, 4, 4, 3, 2, 3, 1, 2, ~
$ EducationField <chr> "Life Sciences", "Medical", "Life Sciences", ~
$ EmployeeCount <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
$ EmployeeNumber <int> 859, 1128, 1412, 2016, 1646, 733, 1448, 1105,~
$ EnvironmentSatisfaction <int> 2, 3, 3, 3, 1, 4, 2, 4, 3, 4, 1, 3, 3, 3, 4, ~
$ Gender <chr> "Male", "Male", "Male", "Female", "Female", "~
$ HourlyRate <int> 73, 44, 60, 48, 32, 32, 90, 88, 87, 92, 81, 4~
$ JobInvolvement <int> 3, 2, 3, 3, 3, 3, 4, 2, 3, 2, 2, 3, 3, 3, 3, ~
$ JobLevel <int> 2, 5, 3, 3, 1, 3, 1, 2, 1, 2, 5, 1, 3, 1, 1, ~
$ JobRole <chr> "Sales Executive", "Research Director", "Manu~
$ JobSatisfaction <int> 4, 3, 4, 4, 4, 1, 3, 4, 3, 3, 3, 4, 3, 2, 1, ~
$ MaritalStatus <chr> "Divorced", "Single", "Single", "Married", "S~
$ MonthlyIncome <int> 4403, 19626, 9362, 10422, 3760, 8793, 2127, 6~
$ MonthlyRate <int> 9250, 17544, 19944, 24032, 17218, 4809, 5561,~
$ NumCompaniesWorked <int> 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 7, 1, 3, 1, 6, ~
$ Over18 <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", ~
$ OverTime <chr> "No", "No", "No", "No", "Yes", "No", "Yes", "~
$ PercentSalaryHike <int> 11, 14, 11, 19, 13, 21, 12, 14, 19, 14, 13, 1~
$ PerformanceRating <int> 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, ~
$ RelationshipSatisfaction <int> 3, 1, 3, 3, 3, 3, 1, 3, 4, 2, 4, 2, 2, 1, 3, ~
$ StandardHours <int> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 8~
$ StockOptionLevel <int> 1, 0, 0, 2, 0, 2, 0, 3, 1, 1, 0, 1, 0, 1, 0, ~
$ TotalWorkingYears <int> 8, 21, 10, 14, 6, 9, 7, 8, 1, 8, 21, 3, 17, 1~
$ TrainingTimesLastYear <int> 3, 2, 2, 3, 2, 4, 5, 5, 2, 3, 2, 2, 3, 3, 3, ~
$ WorkLifeBalance <int> 2, 4, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 4, 3, 4, ~
$ YearsAtCompany <int> 5, 20, 2, 14, 6, 9, 4, 1, 1, 8, 16, 3, 8, 1, ~
$ YearsInCurrentRole <int> 2, 7, 2, 10, 3, 7, 2, 0, 1, 2, 12, 2, 5, 0, 6~
$ YearsSinceLastPromotion <int> 0, 4, 2, 5, 1, 1, 0, 0, 0, 7, 6, 2, 1, 0, 5, ~
$ YearsWithCurrManager <int> 3, 9, 2, 7, 3, 7, 3, 0, 0, 7, 14, 2, 6, 0, 7,~
# count and percent
Gender_Job_table = employeeData %>%
count(Gender, JobSatisfaction) %>%
group_by(Gender) %>%
mutate(proportion = n/sum(n))
Gender_Job_table
# A tibble: 8 x 4
# Groups: Gender [2]
Gender JobSatisfaction n proportion
<chr> <int> <int> <dbl>
1 Female 1 71 0.201
2 Female 2 76 0.215
3 Female 3 102 0.288
4 Female 4 105 0.297
5 Male 1 108 0.209
6 Male 2 90 0.174
7 Male 3 152 0.295
8 Male 4 166 0.322
#Create visulization
JS_Gender = Gender_Job_table %>%
ggplot(aes(x = JobSatisfaction, y = proportion, fill = Gender)) +
geom_col(show.legend = TRUE, position = "dodge")+
ggtitle("Employees Job Satistifaction by Gender") + xlab("Job Satisfaction") + ylab("Proportion")
# Display Employees Per Job Satistifaction by Gender
ggplotly(JS_Gender)
#count and percent
Age_Job_table = employeeData %>%
count(Age, JobSatisfaction) %>%
group_by(Age) %>%
mutate(proportion = n/sum(n))
Age_Job_table
# A tibble: 161 x 4
# Groups: Age [43]
Age JobSatisfaction n proportion
<int> <int> <int> <dbl>
1 18 2 1 0.167
2 18 3 3 0.5
3 18 4 2 0.333
4 19 1 1 0.143
5 19 2 3 0.429
6 19 3 1 0.143
7 19 4 2 0.286
8 20 1 1 0.25
9 20 3 1 0.25
10 20 4 2 0.5
# ... with 151 more rows
#Create visualization
JS_Age = Age_Job_table %>%
ggplot(aes(x = JobSatisfaction, y = proportion, fill = Age)) +
geom_col(show.legend = TRUE, position = "dodge")+
ggtitle("Employees Job Satistifaction by Age") + xlab("Job Satisfaction") + ylab("Proportion")
#Display Employees Per Job Satisfaction by Age
ggplotly(JS_Age)
#count and percent
AttrOT_table = employeeData %>%
count(Attrition, OverTime) %>%
group_by(Attrition) %>%
mutate(proportion = n/sum(n))
AttrOT_table
# A tibble: 4 x 4
# Groups: Attrition [2]
Attrition OverTime n proportion
<chr> <chr> <int> <dbl>
1 No No 558 0.764
2 No Yes 172 0.236
3 Yes No 60 0.429
4 Yes Yes 80 0.571
#Create visualization
Attr_OT = AttrOT_table %>%
ggplot(aes(x = OverTime, y = proportion, fill = Attrition)) +
geom_col(show.legend = TRUE, position = "dodge")+
ggtitle("Employees Overtime by Attrition") + xlab("Overtime") + ylab("Proportion")
#Display Employees Per Overtime by Attrition
ggplotly(Attr_OT)
#count and percent
AttrJob_table = employeeData %>%
count(JobRole,Attrition) %>%
group_by(JobRole) %>%
mutate(proportion = n/sum(n))
AttrJob_table
# A tibble: 18 x 4
# Groups: JobRole [9]
JobRole Attrition n proportion
<chr> <chr> <int> <dbl>
1 Healthcare Representative No 68 0.895
2 Healthcare Representative Yes 8 0.105
3 Human Resources No 21 0.778
4 Human Resources Yes 6 0.222
5 Laboratory Technician No 123 0.804
6 Laboratory Technician Yes 30 0.196
7 Manager No 47 0.922
8 Manager Yes 4 0.0784
9 Manufacturing Director No 85 0.977
10 Manufacturing Director Yes 2 0.0230
11 Research Director No 50 0.980
12 Research Director Yes 1 0.0196
13 Research Scientist No 140 0.814
14 Research Scientist Yes 32 0.186
15 Sales Executive No 167 0.835
16 Sales Executive Yes 33 0.165
17 Sales Representative No 29 0.547
18 Sales Representative Yes 24 0.453
# Create visualization
Attr_Job = AttrJob_table %>%
ggplot(aes(x = Attrition, y = proportion, fill = JobRole)) +
geom_col(show.legend = TRUE, position = "dodge")+
ggtitle("Employees Attrition by Job Role") + xlab("Job Role") + ylab("Proportion")
#Display Employees Attrition By Job Role
ggplotly(Attr_Job)
#count and percent
OTJob_table = employeeData %>%
count(JobRole, OverTime) %>%
group_by(JobRole) %>%
mutate(proportion = n/sum(n))
OTJob_table
# A tibble: 18 x 4
# Groups: JobRole [9]
JobRole OverTime n proportion
<chr> <chr> <int> <dbl>
1 Healthcare Representative No 54 0.711
2 Healthcare Representative Yes 22 0.289
3 Human Resources No 21 0.778
4 Human Resources Yes 6 0.222
5 Laboratory Technician No 120 0.784
6 Laboratory Technician Yes 33 0.216
7 Manager No 41 0.804
8 Manager Yes 10 0.196
9 Manufacturing Director No 64 0.736
10 Manufacturing Director Yes 23 0.264
11 Research Director No 35 0.686
12 Research Director Yes 16 0.314
13 Research Scientist No 107 0.622
14 Research Scientist Yes 65 0.378
15 Sales Executive No 141 0.705
16 Sales Executive Yes 59 0.295
17 Sales Representative No 35 0.660
18 Sales Representative Yes 18 0.340
#Create Visualization
OTJob = OTJob_table %>%
ggplot(aes(x = OverTime, y = proportion, fill = JobRole)) +
geom_col(show.legend = TRUE, position = "dodge")+
ggtitle("Employees Overtime by Job Role") + xlab("Overtime") + ylab("Proportion")
#Display Employees Overtime By Job Role
ggplotly(OTJob)
#count and percent
AttrAge_table = employeeData %>%
count(Age, Attrition) %>%
group_by(Age) %>%
mutate(proportion = n/sum(n))
AttrAge_table
# A tibble: 81 x 4
# Groups: Age [43]
Age Attrition n proportion
<int> <chr> <int> <dbl>
1 18 No 2 0.333
2 18 Yes 4 0.667
3 19 No 3 0.429
4 19 Yes 4 0.571
5 20 No 1 0.25
6 20 Yes 3 0.75
7 21 No 4 0.667
8 21 Yes 2 0.333
9 22 No 5 0.625
10 22 Yes 3 0.375
# ... with 71 more rows
#Create visualization
AttrAge = AttrAge_table %>%
ggplot(aes(x = Attrition, y = proportion, fill = Age)) +
geom_col(show.legend = TRUE, position = "dodge")+
ggtitle("Employees Attrition By Age") + xlab("Attrition") + ylab("Proportion")
#Display Employees Attrition By Age
ggplotly(AttrAge)
# count and percent
Attr_MI_table = employeeData %>%
count(MonthlyIncome, Attrition) %>%
group_by(MonthlyIncome) %>%
mutate(proportion = n/sum(n))
Attr_MI_table
# A tibble: 839 x 4
# Groups: MonthlyIncome [826]
MonthlyIncome Attrition n proportion
<int> <chr> <int> <dbl>
1 1081 Yes 1 1
2 1091 Yes 1 1
3 1102 Yes 1 1
4 1118 Yes 1 1
5 1129 No 1 1
6 1223 No 1 1
7 1274 No 1 1
8 1281 No 1 1
9 1393 Yes 1 1
10 1420 Yes 1 1
# ... with 829 more rows
# visualization
Attr_MI = Attr_MI_table %>%
ggplot(aes(x = Attrition, y = proportion, fill = MonthlyIncome)) +
geom_col(show.legend = TRUE, position = "dodge")+
ggtitle("Employees Attrition by Monthly Income") + xlab("Attrition") + ylab("Proportion")
#Display Employees Per Attrition by Monthly Income
ggplotly(Attr_MI)
# data prep
df = employeeData
summary(df)
ID Age Attrition BusinessTravel
Min. : 1.0 Min. :18.00 Length:870 Length:870
1st Qu.:218.2 1st Qu.:30.00 Class :character Class :character
Median :435.5 Median :35.00 Mode :character Mode :character
Mean :435.5 Mean :36.83
3rd Qu.:652.8 3rd Qu.:43.00
Max. :870.0 Max. :60.00
DailyRate Department DistanceFromHome Education
Min. : 103.0 Length:870 Min. : 1.000 Min. :1.000
1st Qu.: 472.5 Class :character 1st Qu.: 2.000 1st Qu.:2.000
Median : 817.5 Mode :character Median : 7.000 Median :3.000
Mean : 815.2 Mean : 9.339 Mean :2.901
3rd Qu.:1165.8 3rd Qu.:14.000 3rd Qu.:4.000
Max. :1499.0 Max. :29.000 Max. :5.000
EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction
Length:870 Min. :1 Min. : 1.0 Min. :1.000
Class :character 1st Qu.:1 1st Qu.: 477.2 1st Qu.:2.000
Mode :character Median :1 Median :1039.0 Median :3.000
Mean :1 Mean :1029.8 Mean :2.701
3rd Qu.:1 3rd Qu.:1561.5 3rd Qu.:4.000
Max. :1 Max. :2064.0 Max. :4.000
Gender HourlyRate JobInvolvement JobLevel
Length:870 Min. : 30.00 Min. :1.000 Min. :1.000
Class :character 1st Qu.: 48.00 1st Qu.:2.000 1st Qu.:1.000
Mode :character Median : 66.00 Median :3.000 Median :2.000
Mean : 65.61 Mean :2.723 Mean :2.039
3rd Qu.: 83.00 3rd Qu.:3.000 3rd Qu.:3.000
Max. :100.00 Max. :4.000 Max. :5.000
JobRole JobSatisfaction MaritalStatus MonthlyIncome
Length:870 Min. :1.000 Length:870 Min. : 1081
Class :character 1st Qu.:2.000 Class :character 1st Qu.: 2840
Mode :character Median :3.000 Mode :character Median : 4946
Mean :2.709 Mean : 6390
3rd Qu.:4.000 3rd Qu.: 8182
Max. :4.000 Max. :19999
MonthlyRate NumCompaniesWorked Over18 OverTime
Min. : 2094 Min. :0.000 Length:870 Length:870
1st Qu.: 8092 1st Qu.:1.000 Class :character Class :character
Median :14074 Median :2.000 Mode :character Mode :character
Mean :14326 Mean :2.728
3rd Qu.:20456 3rd Qu.:4.000
Max. :26997 Max. :9.000
PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours
Min. :11.0 Min. :3.000 Min. :1.000 Min. :80
1st Qu.:12.0 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:80
Median :14.0 Median :3.000 Median :3.000 Median :80
Mean :15.2 Mean :3.152 Mean :2.707 Mean :80
3rd Qu.:18.0 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:80
Max. :25.0 Max. :4.000 Max. :4.000 Max. :80
StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance
Min. :0.0000 Min. : 0.00 Min. :0.000 Min. :1.000
1st Qu.:0.0000 1st Qu.: 6.00 1st Qu.:2.000 1st Qu.:2.000
Median :1.0000 Median :10.00 Median :3.000 Median :3.000
Mean :0.7839 Mean :11.05 Mean :2.832 Mean :2.782
3rd Qu.:1.0000 3rd Qu.:15.00 3rd Qu.:3.000 3rd Qu.:3.000
Max. :3.0000 Max. :40.00 Max. :6.000 Max. :4.000
YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion
Min. : 0.000 Min. : 0.000 Min. : 0.000
1st Qu.: 3.000 1st Qu.: 2.000 1st Qu.: 0.000
Median : 5.000 Median : 3.000 Median : 1.000
Mean : 6.962 Mean : 4.205 Mean : 2.169
3rd Qu.:10.000 3rd Qu.: 7.000 3rd Qu.: 3.000
Max. :40.000 Max. :18.000 Max. :15.000
YearsWithCurrManager
Min. : 0.00
1st Qu.: 2.00
Median : 3.00
Mean : 4.14
3rd Qu.: 7.00
Max. :17.00
df = df %>%
dplyr::select(-ID, - EmployeeCount, - EmployeeNumber, -Over18) %>%
mutate_if(is.character, factor)
## Create train and test sets
set.seed(120)
trainIndices = sample(seq(1, nrow(df), by = 1),(.7*nrow(df)))
trainData = df[trainIndices,]
testData = df[-trainIndices,]
# Naive Bayes model
model.nb = naiveBayes(Attrition ~ ., data = df, positive = "Yes")
# model summary
summary(model.nb )
Length Class Mode
apriori 2 table numeric
tables 31 -none- list
levels 2 -none- character
isnumeric 31 -none- logical
call 5 -none- call
model.nb$apriori
Y
No Yes
730 140
# Prediction
predData = predict(model.nb, testData)
table(predData)
predData
No Yes
214 47
predData = factor(predData)
# Accuracy
confusionMatrix(predData,testData$Attrition, positive = "Yes")
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 202 12
Yes 22 25
Accuracy : 0.8697
95% CI : (0.8227, 0.9081)
No Information Rate : 0.8582
P-Value [Acc > NIR] : 0.3349
Kappa : 0.5189
Mcnemar's Test P-Value : 0.1227
Sensitivity : 0.67568
Specificity : 0.90179
Pos Pred Value : 0.53191
Neg Pred Value : 0.94393
Prevalence : 0.14176
Detection Rate : 0.09579
Detection Prevalence : 0.18008
Balanced Accuracy : 0.78873
'Positive' Class : Yes
The model requirements were atleast a 60% sensitivity and specificity for the training and validation set. The model met these requirements with a 86.97% accuracy, 67.57% sensitivity and 90.18%.
# data prep
df = employeeData
summary(df)
ID Age Attrition BusinessTravel
Min. : 1.0 Min. :18.00 Length:870 Length:870
1st Qu.:218.2 1st Qu.:30.00 Class :character Class :character
Median :435.5 Median :35.00 Mode :character Mode :character
Mean :435.5 Mean :36.83
3rd Qu.:652.8 3rd Qu.:43.00
Max. :870.0 Max. :60.00
DailyRate Department DistanceFromHome Education
Min. : 103.0 Length:870 Min. : 1.000 Min. :1.000
1st Qu.: 472.5 Class :character 1st Qu.: 2.000 1st Qu.:2.000
Median : 817.5 Mode :character Median : 7.000 Median :3.000
Mean : 815.2 Mean : 9.339 Mean :2.901
3rd Qu.:1165.8 3rd Qu.:14.000 3rd Qu.:4.000
Max. :1499.0 Max. :29.000 Max. :5.000
EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction
Length:870 Min. :1 Min. : 1.0 Min. :1.000
Class :character 1st Qu.:1 1st Qu.: 477.2 1st Qu.:2.000
Mode :character Median :1 Median :1039.0 Median :3.000
Mean :1 Mean :1029.8 Mean :2.701
3rd Qu.:1 3rd Qu.:1561.5 3rd Qu.:4.000
Max. :1 Max. :2064.0 Max. :4.000
Gender HourlyRate JobInvolvement JobLevel
Length:870 Min. : 30.00 Min. :1.000 Min. :1.000
Class :character 1st Qu.: 48.00 1st Qu.:2.000 1st Qu.:1.000
Mode :character Median : 66.00 Median :3.000 Median :2.000
Mean : 65.61 Mean :2.723 Mean :2.039
3rd Qu.: 83.00 3rd Qu.:3.000 3rd Qu.:3.000
Max. :100.00 Max. :4.000 Max. :5.000
JobRole JobSatisfaction MaritalStatus MonthlyIncome
Length:870 Min. :1.000 Length:870 Min. : 1081
Class :character 1st Qu.:2.000 Class :character 1st Qu.: 2840
Mode :character Median :3.000 Mode :character Median : 4946
Mean :2.709 Mean : 6390
3rd Qu.:4.000 3rd Qu.: 8182
Max. :4.000 Max. :19999
MonthlyRate NumCompaniesWorked Over18 OverTime
Min. : 2094 Min. :0.000 Length:870 Length:870
1st Qu.: 8092 1st Qu.:1.000 Class :character Class :character
Median :14074 Median :2.000 Mode :character Mode :character
Mean :14326 Mean :2.728
3rd Qu.:20456 3rd Qu.:4.000
Max. :26997 Max. :9.000
PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours
Min. :11.0 Min. :3.000 Min. :1.000 Min. :80
1st Qu.:12.0 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:80
Median :14.0 Median :3.000 Median :3.000 Median :80
Mean :15.2 Mean :3.152 Mean :2.707 Mean :80
3rd Qu.:18.0 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:80
Max. :25.0 Max. :4.000 Max. :4.000 Max. :80
StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance
Min. :0.0000 Min. : 0.00 Min. :0.000 Min. :1.000
1st Qu.:0.0000 1st Qu.: 6.00 1st Qu.:2.000 1st Qu.:2.000
Median :1.0000 Median :10.00 Median :3.000 Median :3.000
Mean :0.7839 Mean :11.05 Mean :2.832 Mean :2.782
3rd Qu.:1.0000 3rd Qu.:15.00 3rd Qu.:3.000 3rd Qu.:3.000
Max. :3.0000 Max. :40.00 Max. :6.000 Max. :4.000
YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion
Min. : 0.000 Min. : 0.000 Min. : 0.000
1st Qu.: 3.000 1st Qu.: 2.000 1st Qu.: 0.000
Median : 5.000 Median : 3.000 Median : 1.000
Mean : 6.962 Mean : 4.205 Mean : 2.169
3rd Qu.:10.000 3rd Qu.: 7.000 3rd Qu.: 3.000
Max. :40.000 Max. :18.000 Max. :15.000
YearsWithCurrManager
Min. : 0.00
1st Qu.: 2.00
Median : 3.00
Mean : 4.14
3rd Qu.: 7.00
Max. :17.00
df = df %>%
dplyr::select(-ID, - EmployeeCount, - EmployeeNumber, -Over18) %>%
mutate_if(is.character, factor)
# Random Forest method
model.rf = randomForest(Attrition ~ ., ntree = 100, keep.forest=FALSE,
data = df,
importance = TRUE)
model.rf
Call:
randomForest(formula = Attrition ~ ., data = df, ntree = 100, keep.forest = FALSE, importance = TRUE)
Type of random forest: classification
Number of trees: 100
No. of variables tried at each split: 5
OOB estimate of error rate: 14.25%
Confusion matrix:
No Yes class.error
No 720 10 0.01369863
Yes 114 26 0.81428571
# Importance Variable Plot
Imp_Var = varImp(model.rf)
(varImpPlot(model.rf))
MeanDecreaseAccuracy MeanDecreaseGini
Age 3.5973985 13.376111
BusinessTravel -0.5630967 3.023952
DailyRate 1.5883593 11.544679
Department 2.4159246 2.293979
DistanceFromHome 1.6180845 11.466528
Education 1.7145657 4.719197
EducationField 1.5976790 8.391209
EnvironmentSatisfaction 1.6543868 6.055542
Gender 0.3923474 1.562547
HourlyRate -0.2347427 10.024692
JobInvolvement 3.7416241 8.373097
JobLevel 4.0980405 4.368232
JobRole 4.5790442 13.026309
JobSatisfaction -0.5504987 5.973957
MaritalStatus 4.9904910 5.336642
MonthlyIncome 6.5021419 18.317048
MonthlyRate -2.0438151 11.147896
NumCompaniesWorked 2.4167839 7.606961
OverTime 8.8362658 13.257364
PercentSalaryHike 2.0237277 8.890304
PerformanceRating 0.2388280 1.180633
RelationshipSatisfaction -0.7918954 5.370032
StandardHours 0.0000000 0.000000
StockOptionLevel 5.4565358 8.810542
TotalWorkingYears 4.5280657 11.792019
TrainingTimesLastYear 0.8565250 5.792272
WorkLifeBalance 1.5585349 6.054656
YearsAtCompany 3.5529660 9.731998
YearsInCurrentRole 2.3545268 5.611045
YearsSinceLastPromotion 2.8582968 6.120367
YearsWithCurrManager 4.6991491 6.495443
## Load the validation data set that does not include Attrition
valData = read.csv("CaseStudy2CompSet No Attrition.csv")
#View(valData)
## Export the predicted results from my model into a .csv file for submission
validationPrediction <- predict(model.nb, valData)
table(validationPrediction)
validationPrediction
No Yes
249 51
# create output dataset
output = valData %>% select(ID)
output = output %>%
mutate(Attrition = validationPrediction)
# save dataset as a csv
write.csv(output, file = "Case2PredictionsDHerring Attrition.csv", row.names = FALSE)
Based on the model to the right, it may be predicted that the top three contributing factors to attrition at Frito Lay are Monthly Income, Overtime and Age.
## Run a linear regression with the cleaned data set with Monthly Income for Validation Requirement
fitMonthlyIncome = lm(MonthlyIncome ~ ., data = df)
summary(fitMonthlyIncome)
Call:
lm(formula = MonthlyIncome ~ ., data = df)
Residuals:
Min 1Q Median 3Q Max
-3680.7 -660.4 7.4 625.3 4114.4
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.322e+01 7.725e+02 0.082 0.93479
Age -1.430e+00 5.659e+00 -0.253 0.80049
AttritionYes 8.245e+01 1.156e+02 0.714 0.47573
BusinessTravelTravel_Frequently 1.956e+02 1.422e+02 1.375 0.16950
BusinessTravelTravel_Rarely 3.777e+02 1.202e+02 3.143 0.00173 **
DailyRate 1.449e-01 9.138e-02 1.586 0.11312
DepartmentResearch & Development 1.205e+02 4.774e+02 0.252 0.80083
DepartmentSales -4.485e+02 4.885e+02 -0.918 0.35883
DistanceFromHome -6.712e+00 4.577e+00 -1.466 0.14290
Education -3.377e+01 3.718e+01 -0.908 0.36398
EducationFieldLife Sciences 1.294e+02 3.695e+02 0.350 0.72633
EducationFieldMarketing 1.039e+02 3.915e+02 0.266 0.79067
EducationFieldMedical 1.976e+01 3.704e+02 0.053 0.95746
EducationFieldOther 7.569e+01 3.952e+02 0.192 0.84816
EducationFieldTechnical Degree 8.523e+01 3.848e+02 0.221 0.82476
EnvironmentSatisfaction -4.545e+00 3.369e+01 -0.135 0.89271
GenderMale 1.112e+02 7.454e+01 1.492 0.13606
HourlyRate -3.812e-01 1.827e+00 -0.209 0.83478
JobInvolvement 1.807e+01 5.328e+01 0.339 0.73450
JobLevel 2.786e+03 8.353e+01 33.356 < 2e-16 ***
JobRoleHuman Resources -2.054e+02 5.156e+02 -0.398 0.69052
JobRoleLaboratory Technician -6.021e+02 1.715e+02 -3.512 0.00047 ***
JobRoleManager 4.280e+03 2.835e+02 15.099 < 2e-16 ***
JobRoleManufacturing Director 1.742e+02 1.697e+02 1.027 0.30480
JobRoleResearch Director 4.056e+03 2.193e+02 18.489 < 2e-16 ***
JobRoleResearch Scientist -3.482e+02 1.704e+02 -2.043 0.04135 *
JobRoleSales Executive 5.179e+02 3.579e+02 1.447 0.14830
JobRoleSales Representative 8.120e+01 3.923e+02 0.207 0.83605
JobSatisfaction 2.736e+01 3.339e+01 0.819 0.41288
MaritalStatusMarried 6.666e+01 1.001e+02 0.666 0.50555
MaritalStatusSingle 1.520e+01 1.355e+02 0.112 0.91072
MonthlyRate -9.243e-03 5.148e-03 -1.796 0.07294 .
NumCompaniesWorked 4.915e+00 1.693e+01 0.290 0.77164
OverTimeYes -1.536e+01 8.446e+01 -0.182 0.85577
PercentSalaryHike 2.520e+01 1.583e+01 1.592 0.11187
PerformanceRating -3.247e+02 1.617e+02 -2.008 0.04494 *
RelationshipSatisfaction 1.621e+01 3.331e+01 0.487 0.62665
StandardHours NA NA NA NA
StockOptionLevel 4.062e+00 5.695e+01 0.071 0.94316
TotalWorkingYears 5.124e+01 1.099e+01 4.661 3.66e-06 ***
TrainingTimesLastYear 2.375e+01 2.917e+01 0.814 0.41574
WorkLifeBalance -3.616e+01 5.169e+01 -0.700 0.48441
YearsAtCompany -4.709e+00 1.363e+01 -0.345 0.72990
YearsInCurrentRole 5.629e+00 1.703e+01 0.330 0.74111
YearsSinceLastPromotion 3.048e+01 1.534e+01 1.987 0.04723 *
YearsWithCurrManager -2.576e+01 1.670e+01 -1.542 0.12341
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1057 on 825 degrees of freedom
Multiple R-squared: 0.9498, Adjusted R-squared: 0.9471
F-statistic: 354.9 on 44 and 825 DF, p-value: < 2.2e-16
## Load the validation data set that does not include MonthlyIncome
valData.MI = read_excel("CaseStudy2CompSet No Salary.xlsx")
## Export the predicted results from my model into a .csv file for submission
validationPrediction.MI = predict(fitMonthlyIncome, valData.MI)
output2 = valData.MI %>% select(ID)
output2 = output2 %>%
mutate(MonthlyIncome = validationPrediction.MI)
write.csv(output2, file = "Case2PredictionsDHerring Salary.csv", row.names = FALSE)
The model met the required RMSE of less than $3,000 with a RMSE of $1,057.